Welcome to Project 2! This notebook explores a TV show dataset (data_TV.csv): basic cleaning, exploratory plots, and PCA + K-Means clustering used to recommend similar shows.
# Importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv("data_TV.csv")
df.head()
| | first_air_date | origin_country | original_language | name | popularity | vote_average | vote_count | overview |
|---|---|---|---|---|---|---|---|---|
| 0 | 2021-09-03 | US | en | The D'Amelio Show | 30.104 | 9.0 | 3071 | From relative obscurity and a seemingly normal... |
| 1 | 2008-01-20 | US | en | Breaking Bad | 468.253 | 8.8 | 10131 | When Walter White, a New Mexico chemistry teac... |
| 2 | 2021-11-06 | US | en | Arcane | 95.667 | 8.7 | 2615 | Amid the stark discord of twin cities Piltover... |
| 3 | 2013-12-02 | US | en | Rick and Morty | 1511.996 | 8.7 | 7220 | Rick is a mentally-unbalanced but scientifical... |
| 4 | 2022-04-14 | US | en | The Kardashians | 195.038 | 8.7 | 1627 | The family you know and love is here with a br... |
# Last 5 rows
df.tail()
| | first_air_date | origin_country | original_language | name | popularity | vote_average | vote_count | overview |
|---|---|---|---|---|---|---|---|---|
| 2612 | 2002-06-11 | US | en | American Idol | 34.052 | 5.2 | 135 | Each year, hopeful singers from all over the c... |
| 2613 | 2000-07-05 | US | en | Big Brother | 47.029 | 4.9 | 190 | American version of the reality game show whic... |
| 2614 | 1997-03-31 | GB | en | Teletubbies | 36.875 | 4.1 | 108 | Pre-school fun, fantasy and education with col... |
| 2615 | 1985-02-19 | GB | en | EastEnders | 108.720 | 3.9 | 183 | The everyday lives of working-class residents ... |
| 2616 | 2006-10-09 | CA | fr | La Job | 6.968 | 0.6 | 162 | La Job is a French Canadian comedy television ... |
# Checking the shape of the dataframe
df.shape
(2617, 8)
# Column names
df.columns
Index(['first_air_date', 'origin_country', 'original_language', 'name', 'popularity', 'vote_average', 'vote_count', 'overview'], dtype='object')
# Checking data types
df.dtypes
first_air_date        object
origin_country        object
original_language     object
name                  object
popularity           float64
vote_average         float64
vote_count             int64
overview              object
dtype: object
# Checking null values
df.isnull().sum()
first_air_date        6
origin_country        0
original_language     0
name                  0
popularity            0
vote_average          0
vote_count            0
overview             65
dtype: int64
# Total null values
df.isnull().sum().sum()
71
There are 71 null values in the dataset: 6 in first_air_date and 65 in overview. These need to be handled before modeling; here the affected rows are simply dropped.
df = df.dropna()
df.shape
(2548, 8)
df.isnull().sum()
first_air_date       0
origin_country       0
original_language    0
name                 0
popularity           0
vote_average         0
vote_count           0
overview             0
dtype: int64
df.isna().sum()
first_air_date       0
origin_country       0
original_language    0
name                 0
popularity           0
vote_average         0
vote_count           0
overview             0
dtype: int64
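Dropping is not the only option here: since overview is free text, those 65 missing values could instead be filled with an empty string, dropping only the 6 rows missing first_air_date. A minimal sketch of that alternative, applied to the raw data before the drop (not what this notebook does; df_raw and df_alt are hypothetical names):

# Alternative null handling: keep rows that only lacked an overview
df_raw = pd.read_csv("data_TV.csv")
df_alt = df_raw.fillna({'overview': ''}).dropna(subset=['first_air_date'])
df_alt.shape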
# Information about the dataset
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2548 entries, 0 to 2616
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   first_air_date     2548 non-null   object
 1   origin_country     2548 non-null   object
 2   original_language  2548 non-null   object
 3   name               2548 non-null   object
 4   popularity         2548 non-null   float64
 5   vote_average       2548 non-null   float64
 6   vote_count         2548 non-null   int64
 7   overview           2548 non-null   object
dtypes: float64(2), int64(1), object(5)
memory usage: 179.2+ KB
df.describe()
| | popularity | vote_average | vote_count |
|---|---|---|---|
| count | 2548.000000 | 2548.000000 | 2548.000000 |
| mean | 60.672048 | 7.691915 | 613.074961 |
| std | 225.264021 | 0.621754 | 1237.841064 |
| min | 0.866000 | 0.600000 | 99.000000 |
| 25% | 16.767500 | 7.300000 | 151.000000 |
| 50% | 27.756500 | 7.700000 | 259.500000 |
| 75% | 50.548500 | 8.100000 | 573.000000 |
| max | 6684.611000 | 9.000000 | 19459.000000 |
import seaborn as sns # for making visualizations
from matplotlib import pyplot as plt # for making visualizations
df['original_language'].value_counts()
en    1682
ja     395
es     246
ko      99
fr      18
tr      18
pt      17
de      13
it      10
zh      10
sv       6
da       6
no       5
ru       5
hi       4
th       2
ca       2
he       2
pl       2
is       2
ar       2
tl       1
nl       1
Name: original_language, dtype: int64
# Countplot of the number of shows per original language
sns.countplot(x='original_language', data=df)
<AxesSubplot:xlabel='original_language', ylabel='count'>
df['vote_average'].value_counts()
7.6    190
8.0    170
7.9    157
7.5    156
7.8    153
8.1    152
8.2    142
7.7    141
8.3    136
7.4    131
7.3    122
7.1     99
7.2     99
8.4     90
8.5     90
8.6     89
7.0     76
8.7     75
6.9     70
6.8     51
6.7     35
6.6     28
6.5     18
6.4     15
6.3     14
6.2      8
6.1      7
5.8      7
6.0      5
5.6      4
5.9      4
5.5      3
5.7      3
4.9      1
3.9      1
4.1      1
9.0      1
5.2      1
5.3      1
8.8      1
0.6      1
Name: vote_average, dtype: int64
# Countplot of vote_average values
plt.figure(figsize=(23,10))
sns.countplot(x='vote_average', data=df)
<AxesSubplot:xlabel='vote_average', ylabel='count'>
# Countplot of origin_country
plt.figure(figsize=(23,10))
sns.countplot(x='origin_country', data=df)
<AxesSubplot:xlabel='origin_country', ylabel='count'>
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
# Encode categorical (object) columns as integers so PCA and K-Means can use them
label_encoder = LabelEncoder()
df_train_obj = df.select_dtypes("object")
df_train_nonobj = df.select_dtypes(exclude=['object'])
df_train_obj = df_train_obj.apply(label_encoder.fit_transform)
# Recombine the encoded and numeric columns
df_train = pd.concat([df_train_obj, df_train_nonobj], axis=1)
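If you ever need to map the encoded integers back to the original strings, one option is to keep a fitted encoder per column; a small sketch of that variant (the encoders dictionary is hypothetical, not part of this notebook):

# keep one fitted encoder per text column so encodings can be inverted later
encoders = {col: LabelEncoder().fit(df[col]) for col in df.select_dtypes("object").columns}
codes = encoders["name"].transform(df["name"])
print(encoders["name"].inverse_transform(codes[:5]))  # round-trips back to the show titles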
# Reduce the encoded data to 3 principal components
pca = PCA(n_components=3, random_state=7)
pca_mdl = pca.fit_transform(df_train)
df_pca = pd.DataFrame(pca_mdl)
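It is worth checking how much of the variance those three components actually retain, especially since the columns are not scaled and high-variance fields such as popularity and vote_count tend to dominate unscaled PCA; a quick check using the pca object fitted above:

print(pca.explained_variance_ratio_)        # share of variance captured by each component
print(pca.explained_variance_ratio_.sum())  # total variance retained by the 3 components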
# imports
from sklearn.cluster import KMeans
# Calculate distortion (inertia) for a range of cluster counts
distortions = []
for i in range(1, 11):
km = KMeans(
n_clusters=i, init='random',
n_init=10, max_iter=300,
tol=1e-04, random_state=0
)
km.fit(df_pca)
distortions.append(km.inertia_)
# plot
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
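As a cross-check on the elbow plot, a silhouette-score sweep over the same PCA data is another common way to pick the number of clusters; a minimal sketch (not part of the original notebook):

from sklearn.metrics import silhouette_score
for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(df_pca)
    print(k, silhouette_score(df_pca, labels))  # higher scores indicate better-separated clusters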
# Fit the final model with k=4 (suggested by the elbow plot above)
km = KMeans(
n_clusters=4, init='random',
n_init=10, max_iter=300,
tol=1e-04, random_state=0
)
y_km = km.fit_predict(df_pca)
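A quick sanity check on how the shows are split across the four clusters (a small addition, not in the original notebook):

print(np.bincount(y_km))  # number of shows assigned to each cluster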
#Using 4 clusters again, visualize for 3 dimensions
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(projection = '3d')
x = df_pca[0]
y = df_pca[1]
z = df_pca[2]
ax.set_xlabel("PCA 1")
ax.set_ylabel("PCA 2")
ax.set_zlabel("PCA 3")
ax.scatter(x, y, z, c = y_km)
plt.show()
# Calculate distortion again, this time on the full encoded data (no PCA)
distortions = []
for i in range(1, 11):
km = KMeans(
n_clusters=i, init='random',
n_init=10, max_iter=300,
tol=1e-04, random_state=0
)
km.fit(df_train)
distortions.append(km.inertia_)
# plot
plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()
# Fit K-Means with k=4 on the full encoded data
km = KMeans(
n_clusters=4, init='random',
n_init=10, max_iter=300,
tol=1e-04, random_state=0
)
y_km = km.fit_predict(df_train)
# Visualize the cluster assignments across (label-encoded) first air dates
sns.countplot(x="first_air_date", data=df_train, hue=y_km)
plt.xticks(rotation=90)
plt.gcf().set_size_inches(5, 5)
# Grab random entry from our data set
sample = df_train.sample()
# rerun to generate different samples
sample
| | first_air_date | origin_country | original_language | name | overview | popularity | vote_average | vote_count |
|---|---|---|---|---|---|---|---|---|
| 513 | 523 | 53 | 4 | 201 | 2444 | 68.872 | 8.2 | 1183 |
df_train.shape
(2548, 8)
# Find out which cluster the sampled show belongs to
# (y_km is positional, while the index still has gaps from dropna, so map the label to a position first)
label = y_km[df_train.index.get_loc(sample.index[0])]
# Pick a random show from the same cluster (np.where returns positions, hence iloc)
recommend_show = df_train.iloc[np.random.choice(np.where(y_km == label)[0])]
# Display the original (non-encoded) record for the recommended show
df.loc[recommend_show.name]
first_air_date                                              2017-03-19
origin_country                                                      CA
original_language                                                   en
name                                                    Anne with an E
popularity                                                     117.989
vote_average                                                       8.7
vote_count                                                        4064
overview             A coming-of-age story about an outsider who, a...
Name: 6, dtype: object
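To make this recommendation step repeatable, the sample / lookup / display logic can be wrapped in a small helper; a minimal sketch assuming the df, df_train, and y_km objects defined above (the function name is hypothetical, not part of the notebook):

def recommend_similar_show(df, df_train, y_km):
    """Sample a random show and return another show from the same K-Means cluster."""
    sample = df_train.sample()
    # map the sampled index label to a position so it lines up with y_km
    pos = df_train.index.get_loc(sample.index[0])
    label = y_km[pos]
    # pick a random member of the same cluster (positions again)
    pick = np.random.choice(np.where(y_km == label)[0])
    return df.loc[df_train.index[pick]]

recommend_similar_show(df, df_train, y_km)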